{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# CalIt2 building people counts dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import os\n",
    "from typing import Final\n",
    "from config import data_raw_folder, data_processed_folder\n",
    "from timeeval import Datasets\n",
    "import matplotlib.pyplot as plt\n",
    "plt.rcParams['figure.figsize'] = (20, 10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Looking for source datasets in /home/projects/akita/data/benchmark-data/data-raw/UCI ML Repository/CalIt2 and\n",
      "saving processed datasets in /home/projects/akita/data/benchmark-data/data-processed\n"
     ]
    }
   ],
   "source": [
    "dataset_collection_name = \"CalIt2\"\n",
    "source_folder = os.path.join(data_raw_folder, \"UCI ML Repository/CalIt2\")\n",
    "target_folder = data_processed_folder\n",
    "\n",
    "from pathlib import Path\n",
    "print(f\"Looking for source datasets in {Path(source_folder).absolute()} and\\nsaving processed datasets in {Path(target_folder).absolute()}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Directories /home/projects/akita/data/benchmark-data/data-processed/multivariate/CalIt2 already exist\n"
     ]
    }
   ],
   "source": [
    "dataset_name = \"CalIt2-traffic\"\n",
    "train_type = \"unsupervised\"\n",
    "train_is_normal = False\n",
    "input_type = \"multivariate\"\n",
    "datetime_index = True\n",
    "dataset_type = \"real\"\n",
    "\n",
    "# create target directory\n",
    "dataset_subfolder = os.path.join(input_type, dataset_collection_name)\n",
    "target_subfolder = os.path.join(target_folder, dataset_subfolder)\n",
    "try:\n",
    "    os.makedirs(target_subfolder)\n",
    "    print(f\"Created directories {target_subfolder}\")\n",
    "except FileExistsError:\n",
    "    print(f\"Directories {target_subfolder} already exist\")\n",
    "    pass\n",
    "\n",
    "dm = Datasets(target_folder)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Processed dataset CalIt2-traffic -> /home/projects/akita/data/benchmark-data/data-processed/multivariate/CalIt2/CalIt2-traffic.test.csv\n"
     ]
    }
   ],
   "source": [
    "# transform data \n",
    "df = pd.read_csv(os.path.join(source_folder, \"CalIt2.data\"), header=None)\n",
    "df.columns = [\"id\", \"date\", \"time\", \"count\"]\n",
    "df.insert(0, \"timestamp\", pd.to_datetime(df[\"date\"] + \" \" + df[\"time\"]))\n",
    "df = df.drop(columns=[\"date\", \"time\"])\n",
    "\n",
    "# in flow\n",
    "df_in = df[df[\"id\"] == 9]\n",
    "df_in = df_in.drop(columns=[\"id\"])\n",
    "df_in.columns = [\"timestamp\", \"in_count\"]\n",
    "\n",
    "# out flow\n",
    "df_out = df[df[\"id\"] == 7]\n",
    "df_out = df_out.drop(columns=[\"id\"])\n",
    "df_out.columns = [\"timestamp\", \"out_count\"]\n",
    "\n",
    "df = pd.merge(df_in, df_out, on=\"timestamp\", how=\"inner\")\n",
    "\n",
    "# read and add labels\n",
    "df_events = pd.read_csv(os.path.join(source_folder, \"CalIt2.events\"), header=None)\n",
    "df_events.columns = [\"date\", \"begin\", \"end\", \"event_type\"]\n",
    "df_events.insert(0, \"begin_timestamp\", pd.to_datetime(df_events[\"date\"] + \" \" + df_events[\"begin\"]))\n",
    "df_events.insert(1, \"end_timestamp\", pd.to_datetime(df_events[\"date\"] + \" \" + df_events[\"end\"]))\n",
    "df_events = df_events.drop(columns=[\"date\", \"begin\", \"end\", \"event_type\"])\n",
    "# labelling\n",
    "df[\"is_anomaly\"] = 0\n",
    "for _, (t1, t2) in df_events.iterrows():\n",
    "    tmp = df[df[\"timestamp\"] >= t1]\n",
    "    tmp = tmp[tmp[\"timestamp\"] <= t2]\n",
    "    df.loc[tmp.index, \"is_anomaly\"] = 1\n",
    "\n",
    "filename = f\"{dataset_name}.test.csv\"\n",
    "path = os.path.join(dataset_subfolder, filename)\n",
    "target_filepath = os.path.join(target_subfolder, filename)\n",
    "dataset_length = len(df)\n",
    "df.to_csv(target_filepath, index=False)\n",
    "print(f\"Processed dataset {dataset_name} -> {target_filepath}\")\n",
    "\n",
    "# save metadata\n",
    "dm.add_dataset((dataset_collection_name, dataset_name),\n",
    "    train_path = None,\n",
    "    test_path = path,\n",
    "    dataset_type = dataset_type,\n",
    "    datetime_index = datetime_index,\n",
    "    split_at = None,\n",
    "    train_type = train_type,\n",
    "    train_is_normal = train_is_normal,\n",
    "    input_type = input_type,\n",
    "    dataset_length = dataset_length\n",
    ")\n",
    "\n",
    "dm.save()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th>train_path</th>\n",
       "      <th>test_path</th>\n",
       "      <th>dataset_type</th>\n",
       "      <th>datetime_index</th>\n",
       "      <th>split_at</th>\n",
       "      <th>train_type</th>\n",
       "      <th>train_is_normal</th>\n",
       "      <th>input_type</th>\n",
       "      <th>length</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>collection_name</th>\n",
       "      <th>dataset_name</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>CalIt2</th>\n",
       "      <th>CalIt2-traffic</th>\n",
       "      <td>NaN</td>\n",
       "      <td>multivariate/CalIt2/CalIt2-traffic.test.csv</td>\n",
       "      <td>real</td>\n",
       "      <td>True</td>\n",
       "      <td>NaN</td>\n",
       "      <td>unsupervised</td>\n",
       "      <td>False</td>\n",
       "      <td>multivariate</td>\n",
       "      <td>5040</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                               train_path  \\\n",
       "collection_name dataset_name                \n",
       "CalIt2          CalIt2-traffic        NaN   \n",
       "\n",
       "                                                                  test_path  \\\n",
       "collection_name dataset_name                                                  \n",
       "CalIt2          CalIt2-traffic  multivariate/CalIt2/CalIt2-traffic.test.csv   \n",
       "\n",
       "                               dataset_type  datetime_index  split_at  \\\n",
       "collection_name dataset_name                                            \n",
       "CalIt2          CalIt2-traffic         real            True       NaN   \n",
       "\n",
       "                                  train_type  train_is_normal    input_type  \\\n",
       "collection_name dataset_name                                                  \n",
       "CalIt2          CalIt2-traffic  unsupervised            False  multivariate   \n",
       "\n",
       "                                length  \n",
       "collection_name dataset_name            \n",
       "CalIt2          CalIt2-traffic    5040  "
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dm.refresh()\n",
    "dm.df().loc[(slice(dataset_collection_name,dataset_collection_name), slice(None))]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Experimentation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.read_csv(os.path.join(source_folder, \"CalIt2.data\"), header=None)\n",
    "df.columns = [\"id\", \"date\", \"time\", \"count\"]\n",
    "df.insert(0, \"timestamp\", pd.to_datetime(df[\"date\"] + \" \" + df[\"time\"]))\n",
    "df = df.drop(columns=[\"date\", \"time\"])\n",
    "\n",
    "# in flow\n",
    "df_in = df[df[\"id\"] == 9]\n",
    "df_in = df_in.drop(columns=[\"id\"])\n",
    "df_in.columns = [\"timestamp\", \"in_count\"]\n",
    "\n",
    "# out flow\n",
    "df_out = df[df[\"id\"] == 7]\n",
    "df_out = df_out.drop(columns=[\"id\"])\n",
    "df_out.columns = [\"timestamp\", \"out_count\"]\n",
    "\n",
    "df = pd.merge(df_in, df_out, on=\"timestamp\", how=\"inner\")\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_events = pd.read_csv(os.path.join(source_folder, \"CalIt2.events\"), header=None)\n",
    "df_events.columns = [\"date\", \"begin\", \"end\", \"event_type\"]\n",
    "df_events.insert(0, \"begin_timestamp\", pd.to_datetime(df_events[\"date\"] + \" \" + df_events[\"begin\"]))\n",
    "df_events.insert(1, \"end_timestamp\", pd.to_datetime(df_events[\"date\"] + \" \" + df_events[\"end\"]))\n",
    "df_events = df_events.drop(columns=[\"date\", \"begin\", \"end\", \"event_type\"])\n",
    "df_events"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# labelling\n",
    "df[\"is_anomaly\"] = 0\n",
    "for _, (t1, t2) in df_events.iterrows():\n",
    "    tmp = df[df[\"timestamp\"] >= t1]\n",
    "    tmp = tmp[tmp[\"timestamp\"] <= t2]\n",
    "    df.loc[tmp.index, \"is_anomaly\"] = 1\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_plot = df.iloc[2500:3000].copy()\n",
    "df_plot = df_plot.set_index(\"timestamp\")\n",
    "df_plot.plot(y=[\"in_count\", \"out_count\"])\n",
    "df_plot[\"is_anomaly\"].plot(secondary_y=True)\n",
    "plt.legend()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "timeeval",
   "language": "python",
   "name": "timeeval"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
